library(data.table)
library(tibble)
library(GSEABase)
library(Matrix)
library(ggplot2)
library(SiPSiC)

# Thresholds and constants shared by the filtering step (see filterData)

# Percentage of the retained cells: a gene is kept only when it is expressed
# in more than this share of them
minimalClusterSize <- 10

# Value subtracted after exponentiating (2^x) log-scaled expression values
logScalingConstant <- 1

# Minimal number of expressed (non-zero) genes a cell needs in order to be kept
minNumOfGenesExpressed <- 1000

# Determines the lineage of a single cell from its two coordinates.
#
# Args:
#   cellName:        Row name of the cell inside allcellLineages.
#   allcellLineages: Table indexed by cell name (row names) holding the columns "X" and "Y".
#                    The coordinates may be stored as numbers, strings or factors.
#
# Returns:
#   "cancerStemCell" when Y > 0; otherwise "Astrocyte" when X < 0; otherwise "Oligodendrocyte".
#
# Raises an error when the coordinate that is needed for the decision is missing or not numeric
# (for instance because cellName is not a row of allcellLineages).
findLineageForCell <- function(cellName, allcellLineages)
{
  # Factors must be converted through their labels: as.numeric() applied directly to a factor
  # returns the internal level codes (1, 2, ...) instead of the stored values
  toNumericCoordinate <- function(rawValue)
  {
    if (is.factor(rawValue))
    {
      rawValue <- as.character(rawValue)
    }

    return (as.numeric(rawValue))
  }

  currCellCoordinates <- allcellLineages[cellName, , drop = FALSE]
  cellYCoordinate <- toNumericCoordinate(currCellCoordinates[, "Y"])

  if (is.na(cellYCoordinate))
  {
    stop("Missing or non-numeric Y coordinate for cell '", cellName, "'", call. = FALSE)
  }

  if (cellYCoordinate > 0)
  {
    return ("cancerStemCell")
  }

  # The X coordinate only matters for cells which are not cancer stem cells
  cellXCoordinate <- toNumericCoordinate(currCellCoordinates[, "X"])

  if (is.na(cellXCoordinate))
  {
    stop("Missing or non-numeric X coordinate for cell '", cellName, "'", call. = FALSE)
  }

  if (cellXCoordinate < 0)
  {
    return ("Astrocyte")
  }

  return ("Oligodendrocyte")
}

# Filters an expression table (genes in rows, cells in columns) and optionally rescales it.
#
# Steps, in order:
#   1. When isLogTPM is TRUE the values are transformed back with 2^x - logOffset.
#   2. Cells expressing (non-zero value) fewer than minGenesPerCell genes are removed.
#   3. Genes are kept only when they are expressed in strictly more than minCellPercent percent
#      of the remaining cells.
#   4. When convertToCPM is TRUE every cell (column) is divided by its total and multiplied by
#      1,000,000.
#
# Args:
#   dataMatrix:      Numeric matrix or data frame, genes x cells.
#   isLogTPM:        TRUE when the input values are log2-scaled and should be transformed back.
#   convertToCPM:    TRUE to rescale every cell to counts per million after filtering.
#   minGenesPerCell: Minimal number of expressed genes per cell (default: minNumOfGenesExpressed).
#   minCellPercent:  Percentage of cells used as the gene threshold (default: minimalClusterSize).
#   logOffset:       Constant subtracted after exponentiation (default: logScalingConstant).
#
# Returns:
#   The filtered table. It keeps two dimensions even when a single cell or gene remains.
#   With convertToCPM = TRUE the result is always a matrix (it passes through t()).
filterData <- function(dataMatrix, isLogTPM, convertToCPM,
                       minGenesPerCell = minNumOfGenesExpressed,
                       minCellPercent = minimalClusterSize,
                       logOffset = logScalingConstant)
{
  filteredDataMatrix <- dataMatrix
  
  if (isLogTPM == TRUE)
  {
    filteredDataMatrix <- 2^(filteredDataMatrix) - logOffset
  }
  
  # Filtering out cells which express less than the minimal number of genes.
  # drop = FALSE keeps the table two-dimensional when only one cell passes the filter
  expressedGenesCounters <- colSums(filteredDataMatrix != 0)
  cellsWithEnoughGenes <- expressedGenesCounters >= minGenesPerCell
  filteredDataMatrix <- filteredDataMatrix[, cellsWithEnoughGenes, drop = FALSE]
  
  # Filtering out genes which are not expressed by more than the minimal expected cluster size
  # of cells (given as a percentage of the cells retained above)
  nonZeroCellCountsForGenes <- rowSums(filteredDataMatrix != 0)
  minNumOfCellsInClust <- ncol(filteredDataMatrix) * (minCellPercent / 100)
  genesWithMinExpression <- nonZeroCellCountsForGenes > minNumOfCellsInClust
  filteredDataMatrix <- filteredDataMatrix[genesWithMinExpression, , drop = FALSE]
  
  # Converting the transcript counts to CPM
  if (convertToCPM == TRUE)
  {
    countSumsOfCells <- colSums(filteredDataMatrix)
    
    # After transposing, cells are rows, so dividing by the vector of per-cell totals
    # (recycled down the rows) divides every cell by its own total
    filteredDataMatrix <- t((t(filteredDataMatrix) / countSumsOfCells) * 1000000)
  }
  
  return (filteredDataMatrix)
}

# Scores a single pathway in all cells, compares the scores between the three cell groups,
# stores the results in the script-level accumulators and saves a violin plot of the scores.
#
# Args:
#   inputPathway:        Gene set object; its geneIds and setName slots are used.
#   dataMatrix:          Expression data, passed unchanged to getPathwayScores().
#   astrocyteCellNames:  Currently unused: every scored cell which is listed neither in
#                        oligoCellNames nor in cancerStemCellNames is labelled "Astrocyte".
#   oligoCellNames:      Names of the cells labelled "Oligodendrocyte".
#   cancerStemCellNames: Names of the cells labelled "CancerStemCell".
#   malignantCellNames:  Names (and order) of the cells whose scores are appended as a new row
#                        of allPathwayScores.
#
# Side effects (all via <<- on variables which must already exist outside the function):
#   - appends a row named after the pathway to allPathwayScores;
#   - stores the unadjusted pairwise T test P values in all_P_Values_astro_vs_oligo,
#     all_P_Values_astro_vs_stem and all_P_Values_oligo_vs_stem;
#   - stores the differences between the group medians in all_effect_sizes_astro_vs_oligo,
#     all_effect_sizes_astro_vs_stem and all_effect_sizes_oligo_vs_stem;
#   - writes "<pathway name>.pdf" into the working directory.
#
# Returns:
#   The value of dev.off() after the plot was saved, or NULL (invisibly) when the pathway could
#   not be scored; in that case a warning is issued and none of the side effects take place.
executePathwayCalculations <- function(inputPathway, dataMatrix, astrocyteCellNames, oligoCellNames, cancerStemCellNames, malignantCellNames)
{
  pathwayGenes <- inputPathway@geneIds
  pathwayName <- inputPathway@setName
  
  # A pathway which fails to be scored is skipped, so that the remaining pathways are still
  # processed. The failure is reported as a warning rather than silently ignored
  pathwayScores <- tryCatch(getPathwayScores(dataMatrix, pathwayGenes),
                            error = function(scoringError) scoringError)
  
  if (inherits(pathwayScores, "error"))
  {
    warning("Skipping pathway '", pathwayName, "': ", conditionMessage(pathwayScores), call. = FALSE)
    return (invisible(NULL))
  }
  
  # NOTE(review): `$` matches list element names partially, so this also picks up an element
  # whose name merely starts with "pathwayScore" - confirm the exact name getPathwayScores() uses
  scoresAsDataFrame <- as.data.frame(pathwayScores$pathwayScore)
  colnames(scoresAsDataFrame)[1] <- "Score"
  
  currPathwayScores <- scoresAsDataFrame[malignantCellNames, "Score"]
  names(currPathwayScores) <- malignantCellNames
  
  allPathwayScores <<- rbind(allPathwayScores, currPathwayScores)
  rownames(allPathwayScores)[nrow(allPathwayScores)] <<- pathwayName
  
  # "Astrocyte" is the default label; it is overridden for the cells of the other two groups
  scoresAsDataFrame$MaligCellType <- "Astrocyte"
  scoresAsDataFrame[rownames(scoresAsDataFrame) %in% oligoCellNames, "MaligCellType"] <- "Oligodendrocyte"
  scoresAsDataFrame[rownames(scoresAsDataFrame) %in% cancerStemCellNames, "MaligCellType"] <- "CancerStemCell"
  
  # Performing the T test to compare the cells of the different groups
  T.TestResult <- pairwise.t.test(scoresAsDataFrame$Score, scoresAsDataFrame$MaligCellType, p.adjust.method = "none")
  
  # Fetching and storing P values of the T test. The P value table is indexed by group names
  # (rows: second group of each pair, columns: first group, in alphabetical order)
  pValueTable <- T.TestResult$p.value
  astro_vs_oligo_P_Val <- pValueTable["Oligodendrocyte", "Astrocyte"]
  astro_vs_stem_P_Val <- pValueTable["CancerStemCell", "Astrocyte"]
  oligo_vs_stem_P_Val <- pValueTable["Oligodendrocyte", "CancerStemCell"]
  
  all_P_Values_astro_vs_oligo[pathwayName] <<- astro_vs_oligo_P_Val
  all_P_Values_astro_vs_stem[pathwayName] <<- astro_vs_stem_P_Val
  all_P_Values_oligo_vs_stem[pathwayName] <<- oligo_vs_stem_P_Val

  # Calculating and storing effect sizes, defined here as the differences between group medians
  cellTypes <- scoresAsDataFrame[, "MaligCellType"]
  astro_median <- median(scoresAsDataFrame[cellTypes == "Astrocyte", "Score"])
  oligo_median <- median(scoresAsDataFrame[cellTypes == "Oligodendrocyte", "Score"])
  stemCell_median <- median(scoresAsDataFrame[cellTypes == "CancerStemCell", "Score"])
  
  astro_vs_oligo_effect <- astro_median - oligo_median
  astro_vs_stem_effect <- astro_median - stemCell_median
  oligo_vs_stem_effect <- oligo_median - stemCell_median
  
  all_effect_sizes_astro_vs_oligo[pathwayName] <<- astro_vs_oligo_effect
  all_effect_sizes_astro_vs_stem[pathwayName] <<- astro_vs_stem_effect
  all_effect_sizes_oligo_vs_stem[pathwayName] <<- oligo_vs_stem_effect
  
  violinPlot <- ggplot(scoresAsDataFrame, aes(x = MaligCellType, y = Score, fill = MaligCellType)) +
    ggtitle(paste0("T-Test astro vs. oligo (Unadjusted!): P < ", astro_vs_oligo_P_Val, "\n",
                   "T-Test astro vs. CancerStem (Unadjusted!): P < ", astro_vs_stem_P_Val, "\n",
                   "T-Test oligo vs. CancerStem (Unadjusted!): P < ", oligo_vs_stem_P_Val, "\n",
                   "Effect size astro vs. oligo: ", astro_vs_oligo_effect, "\n",
                   "Effect size astro vs. CancerStem: ", astro_vs_stem_effect, "\n",
                   "Effect size oligo vs. CancerStem: ", oligo_vs_stem_effect)) +
    geom_violin(trim=FALSE) + geom_boxplot(width=0.1)

  pdf(paste0(pathwayName, ".pdf"))
  
  # Closing the PDF device even when drawing fails, so that no graphics device is left open
  tryCatch(print(violinPlot),
           error = function(plotError)
           {
             dev.off()
             stop(plotError)
           })
  
  dev.off()
}

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
############################################# MAIN #############################################
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Directory which holds the input files and receives all output files. Leave it empty to work
# in the current working directory (calling setwd("") would raise an error)
DATA_DIRECTORY <- ""

if (nzchar(DATA_DIRECTORY))
{
  setwd(DATA_DIRECTORY)
}

GMT_FILE_NAME <- "h.all.v7.0.symbols.pluscc.gmt"
hallmarkGenesets <- getGmt(GMT_FILE_NAME)

# Expression table: genes become the row names, the remaining columns are cells
cellTranscriptomes <- as.data.frame(as_tibble(fread("OG_processed_data_portal.txt", sep = "\t", header = TRUE)))
rownames(cellTranscriptomes) <- cellTranscriptomes[,"GENE"]
cellTranscriptomes <- cellTranscriptomes[,-1]

# Cell coordinates, indexed by cell name.
# NOTE(review): the first row is dropped - presumably a type-descriptor row of the portal
# format rather than a cell; confirm against the input file
maligCellLineage <- read.delim(file = "dif_stem_coordinates_portal2.txt", header = TRUE, sep = "\t")
maligCellLineage <- maligCellLineage[-1,]
rownames(maligCellLineage) <- maligCellLineage[,"NAME"]
maligCellLineage <- maligCellLineage[,-1]

cellAnnotations <- read.delim(file = "cell_type_assignment_portal.txt", header = TRUE, sep = "\t")
cellAnnotations <- cellAnnotations[cellAnnotations[,"SUB.CLUSTER"] != "0" &
                                   cellAnnotations[,"SUB.CLUSTER"] != "group",]

# Only keeping cells for which a cell annotation exists
# (drop = FALSE keeps the tables two-dimensional even if a single cell remains)
cellTranscriptomes <- cellTranscriptomes[, colnames(cellTranscriptomes) %in% cellAnnotations[,"NAME"], drop = FALSE]
malignantAnnotation <- cellAnnotations[cellAnnotations[,"SUB.CLUSTER"] == "malignant",]

# Only keeping malignant cells for which coordinates exist
malignantData <- cellTranscriptomes[, colnames(cellTranscriptomes) %in% malignantAnnotation[,"NAME"], drop = FALSE]
malignantData <- malignantData[, colnames(malignantData) %in% rownames(maligCellLineage), drop = FALSE]
malignantData[is.na(malignantData)] <- 0 # There are, indeed, NAs in the data before this command is executed

malignantData <- filterData(dataMatrix = malignantData, isLogTPM = TRUE, convertToCPM = FALSE)
malignantCellNames <- colnames(malignantData)

# Reordering the list of malignant cell coordinates according to the list of malignant cell names, 
# then determining the lineage of each malignant cell. The result is a one-column character
# matrix ("Lineage") with one row per malignant cell
maligCellLineage <- maligCellLineage[malignantCellNames,]
cellLineages <- matrix(vapply(malignantCellNames, findLineageForCell, character(1), maligCellLineage, USE.NAMES = FALSE),
                       ncol = 1,
                       dimnames = list(malignantCellNames, "Lineage"))

astrocyteCellNames <- rownames(cellLineages)[cellLineages[,"Lineage"] == "Astrocyte"]
oligoCellNames <- rownames(cellLineages)[cellLineages[,"Lineage"] == "Oligodendrocyte"]
cancerStemCellNames <- rownames(cellLineages)[cellLineages[,"Lineage"] == "cancerStemCell"]

malignantData <- Matrix(as.matrix(malignantData), sparse = TRUE)

# Accumulators filled by executePathwayCalculations (one entry / row per pathway)
all_P_Values_astro_vs_oligo <- numeric()
all_P_Values_astro_vs_stem <- numeric()
all_P_Values_oligo_vs_stem <- numeric()

all_effect_sizes_astro_vs_oligo <- numeric()
all_effect_sizes_astro_vs_stem <- numeric()
all_effect_sizes_oligo_vs_stem <- numeric()

allPathwayScores <- numeric()

# The function is called for its side effects only, hence its results are not printed
invisible(lapply(hallmarkGenesets, executePathwayCalculations, malignantData, astrocyteCellNames, oligoCellNames, cancerStemCellNames, malignantCellNames))

# Writing the effect sizes of each comparison to "Effect_Size_<comparison>.csv"
effectSizesByComparison <- list(Astro_vs_Oligo = all_effect_sizes_astro_vs_oligo,
                                Astro_vs_Stem = all_effect_sizes_astro_vs_stem,
                                Oligo_vs_Stem = all_effect_sizes_oligo_vs_stem)

for (comparisonName in names(effectSizesByComparison))
{
  write.csv2(effectSizesByComparison[[comparisonName]], file = paste0("Effect_Size_", comparisonName, ".csv"))
}

# Writing the unadjusted P values of each comparison to "Unadjusted_p_values_<comparison>.csv"
# and their FDR (Benjamini-Hochberg) adjusted values to "FDR_values_<comparison>.csv".
# The adjustment is done across the pathways of each comparison separately
pValuesByComparison <- list(Astro_vs_Oligo = all_P_Values_astro_vs_oligo,
                            Astro_vs_Stem = all_P_Values_astro_vs_stem,
                            Oligo_vs_Stem = all_P_Values_oligo_vs_stem)

for (comparisonName in names(pValuesByComparison))
{
  unadjustedPVals <- pValuesByComparison[[comparisonName]]
  write.csv2(unadjustedPVals, file = paste0("Unadjusted_p_values_", comparisonName, ".csv"))
  
  FDR_Adjusted_P_Vals <- p.adjust(unadjustedPVals, method = "BH")
  write.csv2(FDR_Adjusted_P_Vals, file = paste0("FDR_values_", comparisonName, ".csv"))
}

saveRDS(allPathwayScores, file = "allPathwayScores.RDS")